Binary Classification Task

Boru Chen

In [ ]:
%matplotlib inline

import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np

pd.set_option('display.max_columns', None)

Load Data

  • One-hot encoding
  • Split data:
    • Training set: 90%
    • Test set: 10%
In [ ]:
raw = pd.read_csv("./StrayAnimalsAnalysis/clean1.csv")

# # Drop data without image
# filename = list(map(lambda x: str(x) + "_front_org.jpg", raw["KeyNo"]))
# img_root = "./images"
# without_img = []
# for path in filename:
#     if(not os.path.exists(os.path.join(img_root, path))):
#         without_img.append(path.split('_')[0])

raw.set_index("KeyNo" , inplace=True)
# raw.drop(list(map(lambda x: int(x),without_img)), inplace=True)

y = raw["Adopted"]
# Drop the target and the outcome-related columns that would leak the label
df = raw.drop(columns=["Adopted", "StayDays", "_Situation2Name", "StatusName"])
df_ohe = pd.get_dummies(df)

# The shared random_state keeps the raw and one-hot splits row-aligned
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.1, random_state=42)
X_train_ohe, X_test_ohe, y_train, y_test = train_test_split(df_ohe, y, test_size=0.1, random_state=42)

raw.head()
Out[ ]:
[Table: raw.head(), indexed by KeyNo (27025, 27027, 27034, 27035, 27036), 31 columns: TypeIdName, DistrictTeamName, SexName, ReasonName, VarietyName, VarietyRemark, BuildName, Bodyweight, Age2Name, CoatName, CollarName, ThoracodorsalName, AdoptionName, NotAdoptionName, _Situation2Name, StatusName, IsSterilizationName, EarNoteName, LengthCoatName, HealthStatusName, ShelterName, Adopted, StayDays, and Behavior83 through Behavior90. Categorical values are Traditional Chinese labels such as 混種狗 (mixed-breed dog), 混種貓 (mixed-breed cat), and 政府捕捉 (government capture), alongside the 0/1 Adopted label, StayDays, and binary Behavior flags.]
In [ ]:
X_train.head()
Out[ ]:
[Table: X_train.head(), indexed by KeyNo (84517, 36167, 49816, 110035, 51266), with the same feature columns as raw minus the dropped Adopted, StayDays, _Situation2Name, and StatusName.]
In [ ]:
print("Training Dataset")
print("nrow:", len(X_train))
print("label")
print("1:", sum(y_train))
print("0:", sum(1-y_train))
print()
print("Test Dataset")
print("nrow:", len(X_test))
print("label")
print("1:", sum(y_test))
print("0:", sum(1-y_test))
Training Dataset
nrow: 59784
label
1: 54488
0: 5296

Test Dataset
nrow: 6643
label
1: 6074
0: 569
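
These counts show a strong class imbalance: roughly 91% of the labels are 1, so the accuracy and F1 scores below should be read against the majority-class baseline. A quick check:

In [ ]:
# Fraction of positive labels = accuracy of always predicting "adopted"
print("majority-class baseline:", round(y_train.mean(), 4))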

Logistic Regression

PCA

In [ ]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

scaler = StandardScaler().fit(X_train_ohe)
X_train_std = scaler.transform(X_train_ohe)
X_test_std = scaler.transform(X_test_ohe)

pca = PCA(n_components = 0.8).fit(X_train_std)
X_train_pca = pca.transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
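
With n_components=0.8, PCA keeps the smallest number of components whose cumulative explained variance reaches 80%; a quick check of what survives the reduction:

In [ ]:
# How many principal components were kept, and how much variance they cover
print("components kept:", pca.n_components_)
print("variance explained:", round(pca.explained_variance_ratio_.sum(), 4))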
In [ ]:
from sklearn.linear_model import LogisticRegressionCV

LR_L2 = LogisticRegressionCV(cv=5, random_state=0, solver='saga', max_iter = 10000, n_jobs = -1).fit(X_train_pca, y_train)
LR_L2.predict(X_test_pca)
In [ ]:
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23

os.makedirs('./saved_model', exist_ok=True)
joblib.dump(LR_L2, './saved_model/LR_L2.joblib')
Out[ ]:
['./saved_model/LR_L2.joblib']
In [ ]:
LR_L2 = joblib.load('saved_model/LR_L2.joblib')
LR_L2_predict_prob = LR_L2.predict_proba(X_test_pca)[:,1]
LR_L2_predict = LR_L2.predict(X_test_pca)

from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, f1_score

def plot_roc_curve(fpr, tpr):
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()

fpr, tpr, thresholds = roc_curve(y_test, LR_L2_predict_prob)
plot_roc_curve(fpr, tpr)
roc_auc_score(y_test, LR_L2_predict_prob)
# accuracy_score(y_test, LR_L2_predict)
Out[ ]:
0.8504128634943489

Random Forest

In [ ]:
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier().fit(X_train_ohe, y_train)
In [ ]:
RFC_predict_prob = RFC.predict_proba(X_test_ohe)[:, 1]
RFC_predict = RFC.predict(X_test_ohe)

fpr, tpr, thresholds = roc_curve(y_test, RFC_predict_prob)
plot_roc_curve(fpr, tpr)
roc_auc_score(y_test, RFC_predict_prob)
Out[ ]:
0.9111421929767201
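
The random forest posts the highest AUC of the models tried here, so its impurity-based feature importances give a quick sense of which one-hot features drive predictions; a minimal sketch:

In [ ]:
# Rank one-hot features by impurity-based importance (names from df_ohe)
importances = pd.Series(RFC.feature_importances_, index=X_train_ohe.columns)
print(importances.sort_values(ascending=False).head(10))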

kNN

We run a grid search with 5-fold cross-validation over the hyperparameter k, using geometrically spaced candidate values.

In [ ]:
# 10 candidate k values, geometrically spaced from 1 up to (n/4)^(9/10)
k_range = np.round(np.geomspace(1, (len(X_train_std) / 4) ** 0.9, num=10)).astype(int)
k_range
Out[ ]:
array([   1,    3,    7,   18,   47,  122,  320,  836, 2186, 5716])
In [ ]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train_std, y_train, cv=5, scoring='accuracy')
    k_scores.append(scores.mean())

# save the result 
joblib.dump(k_scores, './saved_model/k_scores.joblib')
Out[ ]:
['./saved_model/k_scores.joblib']
In [ ]:
k_scores = joblib.load('saved_model/k_scores.joblib')
best_k = k_range[np.argmax(k_scores)]
print("max: accuracy", max(k_scores))
print("Set hyperparameter k = ", best_k)
print()
# plot to see clearly
plt.plot(k_range, k_scores)
plt.xlabel('Value of k for kNN')
plt.ylabel('Cross-Validated Accuracy')
plt.title('Accuracy with respect to different k')
plt.savefig("kNN_k.png")
plt.show()
max accuracy: 0.9212498530686226
Set hyperparameter k = 7

Model after tuning

In [ ]:
from sklearn.neighbors import KNeighborsClassifier
kNN = KNeighborsClassifier(n_neighbors = best_k)
kNN.fit(X_train_std, y_train)

kNN_predict_prob = kNN.predict_proba(X_test_std)[:, 1]
kNN_predict = kNN.predict(X_test_std)
In [ ]:
fpr, tpr, thresholds = roc_curve(y_test, kNN_predict_prob)
plot_roc_curve(fpr, tpr)
roc_auc_score(y_test, kNN_predict_prob)
Out[ ]:
0.8341120035091516

Multi-layer Perceptron classifier

In [ ]:
from sklearn.neural_network import MLPClassifier

MLP = MLPClassifier().fit(X_train_std, y_train)
In [ ]:
MLP_predict_prob = MLP.predict_proba(X_test_std)[:, 1]
MLP_predict = MLP.predict(X_test_std)

fpr, tpr, thresholds = roc_curve(y_test, MLP_predict_prob)
plot_roc_curve(fpr, tpr)
roc_auc_score(y_test, MLP_predict_prob)
Out[ ]:
0.8752081099364429
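
MLPClassifier is fit with its defaults (one hidden layer of 100 units, max_iter=200), which can stop before the optimizer converges. An optional variant, not the run scored above, simply raises the iteration budget:

In [ ]:
# Optional re-fit with a larger iteration budget; random_state is an
# added assumption for reproducibility, not part of the original run
MLP = MLPClassifier(max_iter=1000, random_state=0).fit(X_train_std, y_train)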

AdaBoost Classifier

In [ ]:
from sklearn.ensemble import AdaBoostClassifier

AdaBoost = AdaBoostClassifier().fit(X_train_ohe, y_train)
In [ ]:
AdaBoost_predict_prob = AdaBoost.predict_proba(X_test_ohe)[:, 1]
AdaBoost_predict = AdaBoost.predict(X_test_ohe)

fpr, tpr, thresholds = roc_curve(y_test, AdaBoost_predict_prob)
plot_roc_curve(fpr, tpr)
roc_auc_score(y_test, AdaBoost_predict_prob)
Out[ ]:
0.8545156022413665

Linear Discriminant Analysis

In [ ]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

LDA = LinearDiscriminantAnalysis().fit(X_train_std, y_train)
In [ ]:
LDA_predict_prob = LDA.predict_proba(X_test_std)[:, 1]
LDA_predict = LDA.predict(X_test_std)

fpr, tpr, thresholds = roc_curve(y_test, LDA_predict_prob)
plot_roc_curve(fpr, tpr)
roc_auc_score(y_test, LDA_predict_prob)
Out[ ]:
0.8563472879593392

Results

In [ ]:
predict = [LR_L2_predict, kNN_predict, LDA_predict, AdaBoost_predict, RFC_predict, MLP_predict]
predict_prob = [LR_L2_predict_prob, kNN_predict_prob, LDA_predict_prob, AdaBoost_predict_prob, RFC_predict_prob, MLP_predict_prob]

f1 = [f1_score(y_test, p) for p in predict]
auc = [roc_auc_score(y_test, p) for p in predict_prob]
In [ ]:
f1_round = [round(x, 4) for x in f1]
auc_round = [round(x, 4) for x in auc]
print(f1_round)
print(auc_round)
[0.9568, 0.9604, 0.9602, 0.957, 0.9666, 0.9604]
[0.8504, 0.8341, 0.8563, 0.8545, 0.9111, 0.8752]
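
For easier comparison, the rounded scores can be gathered into one table; a minimal sketch (the model order follows the predict list above):

In [ ]:
models = ["Logistic Regression", "kNN", "LDA", "AdaBoost", "Random Forest", "MLP"]
summary = pd.DataFrame({"F1": f1_round, "AUC": auc_round}, index=models)
print(summary.sort_values("AUC", ascending=False))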
In [ ]:
fig = plt.figure(figsize = (6,6), facecolor = 'white', dpi = 300)
ax = fig.add_subplot(111) 
ax.set_xlabel('False Positive Rate', fontsize = 12)
ax.set_ylabel('True Positive Rate', fontsize = 12)
ax.set_title('ROC Curves', fontsize = 14)
ax.plot([0, 1], [0, 1], color='darkgray', linestyle='--')

colors = plt.cm.rainbow(np.linspace(0,1,6))
labels = ["Logistic Regression", "kNN", "LDA", "AdaBoost", "Random Forest", "MLP"]
for i in range(6):
    fpr, tpr, thresholds = roc_curve(y_test, predict_prob[i])
    ax.plot(fpr, tpr, color = colors[i], label = labels[i], alpha = 0.6)
ax.legend()

fig.savefig("ROC.png", bbox_inches = 'tight')


Data Visualization

We use t-SNE here to project the one-hot-encoded features into two dimensions for visualization.

In [ ]:
from sklearn.manifold import TSNE

# Cosine distance metric
tsne_cos = TSNE(n_components=2, metric = 'cosine', random_state = 42)
df_tsne_cos = pd.DataFrame(tsne_cos.fit_transform(df_ohe), columns = ["X1", "X2"])
# joblib.dump(df_tsne_cos, './saved_model/df_tsne_cos.joblib')
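
Fitting t-SNE on all ~66k rows is slow. One optional shortcut, not used for the figure below, is to embed a random subsample first; the 5000-row sample size is an arbitrary assumption:

In [ ]:
# Hypothetical faster run on a 5k-row subsample (not used below)
sample = df_ohe.sample(n=5000, random_state=42)
df_tsne_sample = pd.DataFrame(
    TSNE(n_components=2, metric='cosine', random_state=42).fit_transform(sample),
    columns=["X1", "X2"], index=sample.index)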
In [ ]:
df_tsne_cos = joblib.load('saved_model/df_tsne_cos.joblib')
# df_tsne_cos.reset_index(inplace = True)

df_tsne_cos["y"] = list(y)

fig = plt.figure(figsize = (21,30), facecolor = 'white', dpi = 300)
ax1 = fig.add_subplot(311) 
ax1.set_xlabel('X1', fontsize = 15)
ax1.set_ylabel('X2', fontsize = 15)
ax1.set_title('t-SNE with Cosine Metric', fontsize = 20)

ax1.scatter(df_tsne_cos["X1"][df_tsne_cos["y"] == 1]
            , df_tsne_cos["X2"][df_tsne_cos["y"] == 1]
            , color = "orange"
            , alpha = 0.6
            , label = "1 (adopted)")

ax1.scatter(df_tsne_cos["X1"][df_tsne_cos["y"] == 0]
            , df_tsne_cos["X2"][df_tsne_cos["y"] == 0]
            , color = "green"
            , alpha = 0.6
            , label = "0 (not adopted)")

ax1.set_xlim(-75, 75)
ax1.legend(fontsize = 14)
ax1.grid()